



clear
insheet using /home/Output/Bloom_622.csv	// This is the name of file that includes all Compustat data.

** Organize time
g yrandmo = floor(datadate/100)
g calyear = floor(datadate/10000)
g calmonth= yrandmo - calyear*100
/*
tab calmonth -> 83% of firm-year obs have calmonth >=7
*/
*
local fiscal = 0
if `fiscal' ==1 {
	g year = fyear
}
else if `fiscal' ==0 {
	g year = calyear
	replace year = calyear-1 if calmonth <6 
}
*
* Select years:
keep if year >=1990 & year <=2016
*
*
* Select vars:
keep gvkey year fyear naics indfmt conm fyr emp sale at ///
capx capxv sppe sppiv ppent ppegt capx_dc capxv_dc sppe_dc sppiv_dc ppent_dc ppegt_dc  
* 
* >> Make NET investment rate:
*    NOTES: (1) capx excludes acquisitions, whereas capxv includes them.
*           (2) sppe excludes sales from discont. ops, whereas sppiv includes them.
*           (3) ppent is net of accumulated depreciation, whereas ppegt is a gross measure.
*
g neti = .
replace neti =  capx - sppe if capx!=. & sppe!=.
replace neti = -sppe if capx==. & capx_dc==4
replace neti =  capx if sppe==. & sppe_dc==4
*
g netiv = .
replace netiv =  capxv - sppiv if capxv!=. & sppiv!=.
replace netiv = -sppiv if capxv==. & capxv_dc==4
replace netiv =  capxv if sppiv==. & sppiv_dc==4
*
* >> Deflate sales:
*    This is the nonfarm business output price deflator (BLS).
sort year gvkey 
save comptemp, replace
*
clear
set obs 37
g year = 1980
forval i = 1981(1)2016 {
	replace year = `i' if _n==`i'-1980+1
}
g deflator = 47.20  if _n==1
replace deflator = 51.72  if _n==2
replace deflator = 54.91  if _n==3
replace deflator = 56.84  if _n==4
replace deflator = 58.40  if _n==5
replace deflator = 60.19  if _n==6
replace deflator = 61.04  if _n==7
replace deflator = 62.18  if _n==8
replace deflator = 64.08  if _n==9
replace deflator = 66.41  if _n==10
replace deflator = 68.68  if _n==11
replace deflator = 70.83  if _n==12
replace deflator = 72.02  if _n==13
replace deflator = 73.70  if _n==14
replace deflator = 75.06  if _n==15
replace deflator = 76.39  if _n==16
replace deflator = 77.45  if _n==17
replace deflator = 78.79  if _n==18
replace deflator = 79.08  if _n==19
replace deflator = 79.63  if _n==20
replace deflator = 81.19  if _n==21
replace deflator = 82.50  if _n==22
replace deflator = 83.19  if _n==23
replace deflator = 84.33  if _n==24
replace deflator = 86.30  if _n==25
replace deflator = 89.21  if _n==26
replace deflator = 91.81  if _n==27
replace deflator = 93.66  if _n==28
replace deflator = 95.04  if _n==29
replace deflator = 95.36  if _n==30
replace deflator = 96.37  if _n==31
replace deflator = 98.24   if _n==32
replace deflator = 100.00  if _n==33
replace deflator = 101.44  if _n==34
replace deflator = 103.30  if _n==35
replace deflator = 104.08  if _n==36
replace deflator = 105.04  if _n==37
replace deflator = deflator / 100	/*2012$*/
*
sort year
merge 1:m year using comptemp
keep if _merge==3
drop _merge
replace sale = sale / deflator
sort gvkey year
save comptemp, replace
*
*
*
*
*
*
*
*
*
* (A) BUILD SAMPLE 
*
*************************************************
* (A.1) Address DUPLICATES
************************************************
 *
 *
 * Find firm-year duplicates that do NOT differ by indfmt, eg, one obs has indfmt==FS and the other ==INDL 
sort gvkey year indfmt
by gvkey year: g dups=_N
by gvkey year: g dup =_n
*
g fsfmt = 0
g indlfmt = 0
replace fsfmt = 1 if indfmt=="FS"
replace indlfmt = 1 if indfmt=="INDL"
by gvkey year: egen xfsfmt = sum(fsfmt)
by gvkey year: egen xindlfmt = sum(indlfmt)
tab dups if dup==1
list gvkey year if (xindlfmt>1 | xfsfmt >1) & dups==2 & dup==1
*
*
/*
        |  gvkey   year |
        |---------------|
207890. |  20096   2014 |
322057. |  66552   2010 |
        |---------------|
322059. |  66552   2011 |
322061. |  66552   2012 |
322063. |  66552   2013 |
322065. |  66552   2014 |
322067. |  66552   2015 |
        |---------------|
322069. |  66552   2016 |
331265. | 108728   2014 |
331517. | 108836   2011 |
333997. | 111486   2015 |
369531. | 148750   2016 |
        |---------------|
384093. | 162976   2013 |
404416. | 178735   2016 |
406852. | 179566   2016 |
417070. | 185690   2014 |
427369. | 294911   2015 |

W/ the exception of 66552, the remainder look like this:
*
gvkey	year	fyear	indfmt	at		emp			sale	mkvalt		datadate
20096	2014	.		INDL	.		.			.		.			20141231
20096	2014	2014	INDL	47.774	.			2.371	106.5813	20150331
*
gvkey	year	fyear	indfmt	at		emp			sale	mkvalt		datadate
108728	2014	.		INDL	.		.			.		.			20140630
108728	2014	2014	INDL	74.896	.			63.615	62.2722		20140831
*
gvkey	year	fyear	indfmt	at		emp			sale	mkvalt		datadate
108836	2011	.		INDL	.		.			.		.			20110930
108836	2011	2011	INDL	27.136	.			0		21.001		20111231
*
That is, one of the pair is missing fyear (and other vars). 
The GVKEY 66552 is BIO LAB NATURALS INC. It is missing data starting in  
2009, so the years where duplicate formats are reported can simply be dropped.
*/
*
*
if `fiscal' ==1 {
	drop if gvkey==66552 & year>=2009
}
if `fiscal' ==0 {
	drop if gvkey==66552 & year>=2009
	drop if gvkey==20096 & year==2014 & fyear==.
	drop if gvkey==108728 & year==2014 & fyear==.
	drop if gvkey==108836 & year==2011 & fyear==.
	drop if gvkey==111486 & year==2015 & fyear==.
	drop if gvkey==148750 & year==2016 & fyear==.
	drop if gvkey==162976 & year==2013 & fyear==.
	drop if gvkey==178735 & year==2016 & fyear==.	
	drop if gvkey==179566 & year==2016 & fyear==.	
	drop if gvkey==185690 & year==2014 & fyear==.		
	drop if gvkey==294911 & year==2015 & fyear==.	
}
*
keep gvkey year naics indfmt conm emp sale at ///
neti netiv ppent ppegt  
save temp, replace
*
*
*
* The remainder of duplicates differ on indfmt:
keep if indfmt=="FS"
rename neti neti_fs
rename netiv netiv_fs
rename ppent ppent_fs
rename ppegt ppegt_fs
rename emp emp_fs
rename sale sale_fs
rename at at_fs
rename conm conm_fs
rename naics naics_fs
*
keep gvkey year naics_fs conm_fs neti_fs netiv_fs ppent_fs ppegt_fs emp_fs sale_fs at_fs
sort gvkey year 
save temp1, replace
*
*
clear
use temp
keep if indfmt=="INDL"
rename neti neti_indl
rename netiv netiv_indl
rename ppent ppent_indl
rename ppegt ppegt_indl
rename emp emp_indl
rename sale sale_indl
rename at at_indl
rename conm conm_indl
rename naics naics_indl
*
keep gvkey year naics_indl conm_indl neti_indl netiv_indl ppent_indl ppegt_indl emp_indl sale_indl at_indl 
sort gvkey year
*
merge 1:1 gvkey year using temp1 
sort gvkey year
/*
fiscal = 0:
    Result                           # of obs.
    -----------------------------------------
    not matched                       276,177
        from master                   276,125  (_merge==1)
        from using                         52  (_merge==2)

    matched                            31,318  (_merge==3)
    -----------------------------------------
*/
*
*
*
/*
foreach x in emp neti ppent sale at {
	g `x'_dup1 = 0
	g `x'_dup2 = 0
	replace `x'_dup1 = 1 if `x'_indl!=. & `x'_fs!=.
	replace `x'_dup2 = 1 if `x'_indl!=. & `x'_fs!=.  &  ( (`x'_indl>0 & `x'_fs>0  & `x'_indl >=`x'_fs*0.995  & `x'_indl <=`x'_fs*1.005) | ///
														  (`x'_indl<0 & `x'_fs<0  & `x'_indl <=`x'_fs*0.995  & `x'_indl >=`x'_fs*1.005) )
	tab `x'_dup1 `x'_dup2
	*
	* What you find is that SALE and NETI *never* disagree across the INDL and FS formats. AT almost always agree. 
	* EMP disagrees 3.7% of the time (when both formats are populated). 
	* Discrepancies are most common for PPENT, which disagrees 6% 
	* of the time.
}
*/
*
*
*
g conm = conm_indl	/*this is populated under both INDL And FS*/
foreach x in naics neti netiv ppent ppegt emp sale at {
	* Prioritize INDL:
	g `x' = .
	replace `x' = `x'_indl if `x'_indl!=.
	replace `x' = `x'_fs if (`x'_indl==. | `x'_indl==0) & `x'_fs!=.
}
*
keep gvkey year conm naics neti netiv ppent ppegt emp sale at  
sort gvkey year
by gvkey: g obs =_N
by gvkey: g time=_n
sum year
sum year if time==1
save comptemp2, replace
/*
sum  year                /*Total # of firm-year obs*/
    Variable |        Obs        Mean    Std. Dev.       Min        Max
-------------+---------------------------------------------------------
        year |    307,495    2003.002    7.728422       1990       2016
		
sum  year if time==1     /*Total # of firms*/
    Variable |        Obs        Mean    Std. Dev.       Min        Max
-------------+---------------------------------------------------------
        year |     31,382    1998.984    8.772587       1990       2016
*/
*/



************************************************************************
* (A.2)    FILTERS: 
*       i) FLAG OBS W/ NO SALES, EMPLOYMENT  and 
*          IDENTIFY OUTLIERS AMONG REMAINING OBS. 
*
*      ii) IMPOSE OTHER BLOOM FILTERS
************************************************************************
*
*
* First, will need APL, Employment, and Sales GROWTH:
clear 
use comptemp2
*
g apl = sale*1000000/(emp*1000)
replace apl   = . if apl <=0	/*NOTE: apl already =. if emp==0. And, emp never <0.*/
replace emp   = . if emp <=0
replace sale  = . if sale<=0
g logapl = log(apl)
// Treat as missing if values are non-positive.
*
sort gvkey year
by gvkey: g L1_sale = sale[_n-1]
by gvkey: g saleg   = (sale - L1_sale) / (.5*(sale + L1_sale))
*
by gvkey: g L1_emp= emp[_n-1]
by gvkey: g avgemp= .5*(emp + L1_emp)
by gvkey: g demp  = emp - L1_emp
by gvkey: g empg  = demp/avgemp
*
by gvkey: g L1_apl  = apl[_n-1]
by gvkey: g aplg    = (apl - L1_apl) / (.5*(apl + L1_apl))
*
by gvkey: g dateg = year - year[_n-1]
replace aplg  = . if 1-.001>dateg | dateg>1+.001
replace empg  = . if 1-.001>dateg | dateg>1+.001
replace saleg = . if 1-.001>dateg | dateg>1+.001
// Treat growth rates as missing if years are not adjacent in calendar time.

*
*
* Next, make Investment Rate = net investment / lagged capital.
* We will need these calculations for Bloom sample. (Bloom 
* trims outliers based in part on changes in capital. The 
* idea is that abnormal changes in capital may 
* signal M&A.)
*
* First, *level* of net investment:
g netinv = neti		/*netiv*/
g ppe    = ppent	/*ppegt*/
*
* Second, make capital.
* i)  Identify continuous spells of reporting investment
*     We cannot extend capital (via perpetual inventory) across a break in reporting
g spelltime = 1 if time==1
replace spelltime = (dateg==1)*(netinv!=.)*spelltime[_n-1] + 1 if time >1
g brkpt = 0
replace brkpt = 1 if spelltime==1
by gvkey: g spell = sum(brkpt)		
*
*
* ii) Identify first obs with non-missing and non-zero ppe w/in each spell.
g hasppe = 999999999
replace hasppe = 1 if ppe !=. & ppe !=0
sort gvkey spell year
by gvkey spell: egen year0 = min(year*hasppe)
g kap0_  = 0
replace kap0_ = ppe if year==year0
by gvkey spell: egen kap0 = max(kap0_)
*
*
* iii) Apply Perpetual inventory method
local delta = .1
g cumneti = 0 if year ==year0	//  =. at all other years.
replace cumneti = (1-`delta')*cumneti[_n-1] + netinv if year > year0	// now fill in "." for year > year0.
g kap = ((1-`delta')^(year-year0))*kap0 + cumneti*(year >year0)			// =. for years in spell but < year0
by gvkey spell: g netirate = netinv /(.5*(kap+kap[_n-1]))				// =. for years in spell but < year0.



*
*
*
* Now, make SAMPLE RESTRICTIONS.
*
*
g eligible = 1
* eligible selects obs for final sample.
* It is updated as we impose filters.
*
* Trim outliers as in Bloom.
local pctlo = .5
local pcthi = 99.5
local varstotrim empg saleg netirate
foreach x of local varstotrim {
    egen phi_`x' = pctile(`x'), p(`pcthi')
	egen plo_`x' = pctile(`x'), p(`pctlo')
	g in`x' = 0
	replace in`x' = 1 if (plo_`x' <=`x' & `x'<= phi_`x') | `x'==.	// keep obs. if met
	replace eligible= eligible*in`x'
}
sum logapl 
sum logapl if eligible==1
*
* This flags year t as outlier if growth from t-1 to t exceeds Bloom's bounds. 
* If t is 2nd year of firm's panel, it may be that the outlier lies in year 1.
* To be conservative, drop time=1 if time=2 is ineligible. 
sort gvkey year 
by gvkey: g lead_dateg = dateg[_n+1]
by gvkey: g lead_eligible = eligible[_n+1]
replace eligible = 0 if time==1 & lead_eligible==0 & 1-.001<=lead_dateg & lead_dateg <=1+.001
*
sum logapl if eligible==1

*
*
* Bloom eliminates smaller firms, eg, 
* firms with min. employment >500 and min. sales >10 million.
* Note - this is done after outliers are flagged. This allows
* that what may seem large **w/in** the Bloom sample is not 
* necessarily suspect. Outliers are judged based on the 
* full sample.
by gvkey: egen minemp  = min( emp)
by gvkey: egen minsale = min(sale)
g insize = 0
replace insize = 1 if minemp >=.5 & minsale >=10 & minemp!=. & minsale!=.	//.1, 2
replace eligible = insize*eligible
sum logapl if eligible==1
*
*
*
* Sum stats
* a) Before filters
sum sale, detail		/*in millions*/
sum apl, detail
sum emp, detail			/*emp in thousands*/
*
* b) After filters
sum sale if eligible ==1, detail		/*in millions*/
sum apl if eligible ==1, detail
sum emp if eligible==1, detail			/*emp in thousands*/
*
sum aplg if eligible ==1, detail
sum empg if eligible==1, detail
*
*
*
***********************************************************************
* (B) AUTOCORRELATION of APL
***********************************************************************
*
*
* Set logapl to missing if ineligible according to criteria above:
sort gvkey year 
g logapl_  = logapl
replace logapl_ = . if eligible==0
*
*
save bloomsample, replace
*
*
* Balanced Panels:
qui forval d = 0(5)15 { 
	clear
	use bloomsample 
	
	local yrT = 2016
	local yr1 = 1990 + `d'
	g samplength = `yrT' - `yr1' +1
	by gvkey: egen fobs= sum( (logapl_!=.)*(year>=`yr1')*(year<=`yrT') )
	g insample = (fobs == samplength)
	keep if insample==1 & year >=`yr1'
	*
	* Set weight:
	by gvkey: egen totfemp = sum(emp)
	g favgemp = totfemp / fobs
	g weight = emp	// 1, emp, favgemp
	*
	* Dimensions of sample
	egen firmyrs = sum( (year>`yr1') )	// # of years of apl *growth* is yrT-yr1 +1
	sort year gvkey 
	by year:  g numfirms = _N			// since balanced, this is # of firms in every year.
	*
	* Demean:
	sort gvkey year
	by gvkey: egen eflogapl_ = sum(logapl_*weight)
	by gvkey: egen fweight   = sum(weight)
	replace eflogapl_ = eflogapl_ / fweight		// within firm mean
	sort year gvkey 
	by year:  egen etlogapl_ = sum(logapl_*weight)
	by year:  egen tweight   = sum(weight)
 	egen elogapl_ = sum(logapl_*weight)
	egen popwght2 = sum(weight)
	replace etlogapl_ = etlogapl_ / tweight		// within year mean
	replace elogapl_ = elogapl_ / popwght2 		// grand mean
	g hatlogapl_ = logapl_ - eflogapl_ - etlogapl_ + elogapl_ 
	*
	* Naive estimator:
	replace hatlogapl_ = sqrt(weight)*hatlogapl_
	sort gvkey year 
	by gvkey: g Lhatlogapl_ = hatlogapl_[_n-1]
	replace Lhatlogapl_ = 0 if Lhatlogapl_ ==.
	g argcov = hatlogapl_*Lhatlogapl_	
	g argvar = hatlogapl_^2
	egen autocov = sum(argcov)
	egen var0    = sum(argvar)
	egen popwght1 = sum( weight*(year>`yr1') )
	g naiverho   = (popwght2/popwght1)*autocov/var0
	*
	* Jackknife:
	drop popwght1 popwght2 var0 autocov 
	local yr2 = ceil((`yr1' + `yrT')/2)		// end of first half 
	g indicator1 = (year > `yr1')*(year <=`yr2')
	g indicator2 = (year >=`yr1')*(year <=`yr2')
	egen autocov = sum(argcov*indicator1)
	egen var0    = sum(argvar*indicator2)
	egen popwght1 = sum(weight*indicator1)
	egen popwght2 = sum(weight*indicator2)
	g rho1  = (popwght2/popwght1)*autocov/var0
	*
	drop indicator1 indicator2 popwght1 popwght2 var0 autocov 
	g indicator1 = (year >`yr2')*(year <=`yrT')		// second half
	g indicator2 = (year>=`yr2')*(year <=`yrT')
	egen autocov = sum(argcov*indicator1)
	egen var0    = sum(argvar*indicator2)
	egen popwght1 = sum(weight*indicator1)
	egen popwght2 = sum(weight*indicator2)
	g rho2  = (popwght2/popwght1)*autocov/var0
	*
	g correctrho = 2*naiverho - .5*(rho1+rho2)
	keep numfirms firmyrs naiverho correctrho 
	g year1 = `yr1'
	keep if _n==1
	if `d' >0 {
		append using autoest
	}
	save autoest, replace  
}


*
*
* Full (unbalanced) sample
clear
use bloomsample 

g weight = emp	// 1, emp
by gvkey: g Llogapl_= logapl_[_n-1]
by gvkey: egen fobs = sum( (logapl_!=.)*(Llogapl_!=.) )		// # of valid years for this firm
drop if fobs == 0
sum year if time==1 	// # of firms that remain
g numfirms = r(N) 
egen firmyrs = sum( (logapl_!=.)*(Llogapl_!=.) )	// obs unusable w/out apl and lagged apl
*
by gvkey: egen eflogapl_ = sum(logapl_*weight*(logapl_!=.))
by gvkey: egen fweight   = sum(weight*(logapl_!=.))
replace eflogapl_ = eflogapl_ / fweight		// within firm mean
sort year gvkey 
by year:  egen etlogapl_ = sum(logapl_*weight*(logapl_!=.))
by year:  egen tweight   = sum(weight*(logapl_!=.))
egen elogapl_ = sum(logapl_*weight*(logapl_!=.))
egen popwght2 = sum(weight*(logapl_!=.))
replace etlogapl_ = etlogapl_ / tweight		// within year mean
replace elogapl_ = elogapl_ / popwght2 		// grand mean
g hatlogapl_ = logapl_ - eflogapl_ - etlogapl_ + elogapl_ 
*	
replace hatlogapl_ = sqrt(weight)*hatlogapl_
replace hatlogapl_ = 0 if logapl_ == .
sort gvkey year 
by gvkey: g Lhatlogapl_ = hatlogapl_[_n-1]
replace Lhatlogapl_ = 0 if Lhatlogapl_ ==. 
g argcov = hatlogapl_*Lhatlogapl_	
g argvar = hatlogapl_^2	
egen autocov = sum(argcov)
egen var0    = sum(argvar)
egen popwght1= sum( weight*(logapl_!=.)*(Llogapl_!=.) )
g naiverho   = (popwght2/popwght1)*autocov/var0
*
*
*
* Append to results from balanced panels
keep numfirms firmyrs naiverho
g correctrho = .
g year1 = 1990
keep if _n==1
append using autoest
*
*
* Save to Excel file.
drop year1
mkmat numfirms firmyrs naiverho correctrho, matrix (automat)
putexcel set "/home/Output/Compustat-results.xlsx", sheet("Table F1", replace) modify
putexcel B2  = matrix(automat)
putexcel A1  = "Sample"
putexcel A2  = "1990-2016, unbalanced"
putexcel A3  = "2005-2016"
putexcel A4  = "2000-2016"
putexcel A5  = "1995-2016"
putexcel A6  = "1990-2016"
putexcel A10 = "Sample is balanced panel unless otherwise noted."
putexcel B1 = "Number of firms"
putexcel C1 = "Number of firm-years"
putexcel D1 = "Simple Estimator"
putexcel E1 = "Jackknife Estimator"
*
*
*
************************************************************************
* (C) DISTRIBUTION of APL GROWTH by EMPLOYMENT ADJUSTMENT
************************************************************************
*
*
*
clear 
use bloomsample 

g adjthreshold = .01*avgemp
replace adjthreshold = 1 if avgemp <100

* FULL SAMPLE
sum aplg if eligible==1 

* ADJUSTERS
g adjuster = 0
replace adjuster = 1 if demp >adjthreshold | demp <-1*adjthreshold
sum aplg if eligible==1 & adjuster==1
g std_adjn = r(sd)

* INACTIVE
g noadjuster = 0
replace noadjuster = 1 if demp <=adjthreshold & demp >=-1*adjthreshold
sum aplg if eligible==1 & noadjuster==1
g std_noadjn = r(sd)

keep if _n==1
keep std_noadjn std_adjn 
mkmat std_noadjn std_adjn, matrix (stdmat)
putexcel set "/home/Output/Compustat-results.xlsx", sheet("Table F2", replace) modify
putexcel B2 = matrix(stdmat)
putexcel A2  = "Standard deviation of APL growth"
putexcel B1  = "Nonadjusters"
putexcel C1  = "Adjusters"

